2  Model Validation

Author

Lin Yu

Published

January 27, 2025

library(dplyr)
library(ggplot2)
library(DT)
library(here)

2.1 Recap of tutorial 1

Understand random variables through the following simulation:

## Simulate a population of size n_sample from the linear model
##   y = beta0 + beta1 * x + e,  with  x ~ N(5, 10^2)  and  e ~ N(0, 5^2)
n_sample <- 1000
beta0 <- 0.5
beta1 <- 2

x <- rnorm(n_sample, mean = 5, sd = 10)
random_error <- rnorm(n_sample, mean = 0, sd = 5)
y <- beta0 + beta1 * x + random_error

pop_dat <- data.frame(y = y, x = x)

## Scatter plot of the full population
ggplot(pop_dat, aes(x = x, y = y)) +
  geom_point() +
  theme_bw()

The observed standard deviation of the outcome in the population:

## Histogram of the outcome y over the whole population
ggplot(pop_dat, aes(y)) +
  geom_histogram() +
  theme_bw()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Empirical SD of y; ~20 as expected, since
## Var(y) = beta1^2 * Var(x) + Var(e) = 4*100 + 25 = 425, sqrt(425) ≈ 20.6
sd(pop_dat$y)
[1] 20.08596
sample_size <- 100
## FIX: the original used sample(c(1:sample_size), replace = FALSE), which
## merely permutes the FIRST 100 rows of pop_dat rather than drawing a
## random sample of 100 rows out of all nrow(pop_dat) rows.
sample_dat <- pop_dat[sample(seq_len(nrow(pop_dat)),
                             sample_size,
                             replace = FALSE), ]

## Interactive table and scatter plot of the drawn sample
sample_dat %>% datatable()
sample_dat %>% 
  ggplot(aes(x = x, y = y))+
  geom_point()+
  theme_bw()

The standard deviation of the estimator (\(\hat\beta\), the slope):

## Fit the linear model on the sample; glm() with the default gaussian
## family is equivalent to lm() here.
glm_m <- glm(y~x, data= sample_dat)

set.seed(1017)
## NOTE(review): lapply() is NOT parallel computing — it is a serial
## functional loop (see parallel::mclapply for an actual parallel variant).
## NOTE(review): sample(100, replace = FALSE) draws all 100 values of
## 1:100, i.e. it is just a permutation of rows 1-100. Every iteration
## therefore refits the model on the SAME 100 rows, which is why every
## coefficient printed below is identical (1.94098). The corrected
## resampling appears in the next chunk.
coef_dat <- lapply(1:100, function(i){
  sample_dat <- pop_dat[sample(100,replace = FALSE),]
  glm_m <- glm(y~x, data= sample_dat)
  coef(glm_m)[2]
})

## Peek at the first few replicates — note they are all identical
## because of the resampling bug flagged above.
coef_dat %>% head()
[[1]]
      x 
1.94098 

[[2]]
      x 
1.94098 

[[3]]
      x 
1.94098 

[[4]]
      x 
1.94098 

[[5]]
      x 
1.94098 

[[6]]
      x 
1.94098 
set.seed(1017)
## Corrected resampling: on every replicate draw `sample_size` distinct
## rows out of all nrow(pop_dat) rows, refit, and keep the slope estimate.
beta_hat_dat <- unlist(lapply(seq_len(100), function(rep_id) {
  rows <- sample(seq_len(nrow(pop_dat)),  ## sample from 1:n_sample
                 sample_size,             ## sample size
                 replace = FALSE)
  fit_rep <- glm(y ~ x, data = pop_dat[rows, ])
  coef(fit_rep)[2]
}))

## Sampling distribution of the slope estimates. With the magrittr pipe,
## data.frame(.) deparses its argument `.` as the column name, which is
## why the aesthetic below is aes(.).
beta_hat_dat %>%
  data.frame() %>% 
  ggplot(aes(.)) +
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Repeat the simulation across several sample sizes.
## FIX: the original grew `beta_hat` with cbind() inside a for loop (a
## grow-in-loop anti-pattern, quadratic in the number of columns) and
## reused the global `sample_size` as the loop variable, leaving it set to
## 1000 afterwards. vapply() preallocates the 100 x 5 result matrix and
## keeps the loop variable function-local.
sample_sizes <- c(20, 100, 200, 500, 1000)
beta_hat <- vapply(sample_sizes, function(sample_size) {
  unlist(lapply(seq_len(100), function(i) {
    rows <- sample(seq_len(nrow(pop_dat)),  ## sample from 1:n_sample
                   sample_size,             ## sample size
                   replace = FALSE)
    fit_rep <- glm(y ~ x, data = pop_dat[rows, ])
    coef(fit_rep)[2]
  }))
}, numeric(100))

## Convert the estimate matrix to a data frame and label each column
## with the sample size it was computed from.
beta_hat <- data.frame(beta_hat)
colnames(beta_hat) <- paste("sample size =", c(20, 100, 200, 500, 1000))

## Interactive table of all replicates
datatable(beta_hat)
## Column-wise SD of the slope estimates: the SD shrinks as the sample
## size grows. The last column is ~2.5e-15 (numerically zero) because a
## size-1000 sample without replacement from a 1000-row population is the
## whole population, so every replicate fits identical data.
apply(beta_hat,2, sd)
  sample size = 20  sample size = 100  sample size = 200  sample size = 500 
      1.331442e-01       4.579778e-02       3.317768e-02       1.444121e-02 
sample size = 1000 
      2.500723e-15 

viz the distribution of \(\hat \beta\):

library(tidyr)
Warning: package 'tidyr' was built under R version 4.3.2
## Reshape to long format and turn `sample size` into a factor whose level
## order puts the facets in increasing order (20, 100, ..., 1000).
## names_prefix strips "sample size =" from the column names; the leftover
## leading space is harmless because as.numeric() trims whitespace.
size_levels <- c("20", "100", "200", "500", "1000")

plot_dat <- beta_hat %>%
  pivot_longer(cols = everything(),
               values_to = "beta_hat",
               names_to = "sample size",
               names_prefix = "sample size =") %>%
  mutate(`sample size` = factor(as.numeric(`sample size`),
                                levels = size_levels,
                                labels = size_levels))


  plot_dat %>% 
    ggplot(aes(x = beta_hat,color = `sample size`, fill = `sample size`)) +
    geom_histogram()+
    facet_grid(cols = vars(`sample size`),scales = "free_y")+
    theme_bw()+
    theme(legend.position = "bottom")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2.2 Tutorial 2: Model Validation

Use the hwy2.RData file available from the course page. After attaching the rms package and doing the usual datadist()/options() task
1. Fit the following model

fit <- ols(rate~rcs(trks,4)+rcs(sigs1,4)+type+hwy,data=hwy2,x=TRUE,y=TRUE)
  2. Run both the ordinary bootstrap validation and the .632 bootstrap validation on this model, and compare the results.

## NOTE(review): removed rm(list = ls()) — wiping the global environment
## inside a script is an anti-pattern; restart the R session instead if a
## clean workspace is needed.
library(rms)
library(dplyr)
## here() resolves the path relative to the project root, so no manual
## setwd() is required.
load(here("data","hwy2.RData"))

## Inspect the structure of the highway dataset
str(hwy2)
'data.frame':   39 obs. of  13 variables:
 $ rate : num  4.58 2.86 3.02 2.29 1.61 6.87 3.85 6.12 3.29 5.88 ...
 $ len  : num  4.99 16.11 9.75 10.65 20.01 ...
 $ ADT  : int  69 73 49 61 28 30 46 25 43 23 ...
 $ trks : int  8 8 10 13 12 6 8 9 12 7 ...
 $ sigs1: num  0.2004 0.0621 0.1026 0.0939 0.05 ...
 $ slim : int  55 60 60 65 70 55 55 55 50 50 ...
 $ shld : int  10 10 10 10 10 10 8 10 4 5 ...
 $ lane : int  8 4 4 6 4 4 4 4 4 4 ...
 $ acpt : num  4.6 4.4 4.7 3.8 2.2 24.8 11 18.5 7.5 8.2 ...
 $ itg  : num  1.2 1.43 1.54 0.94 0.65 0.34 0.47 0.38 0.95 0.12 ...
 $ lwid : int  12 12 12 12 12 12 12 12 12 12 ...
 $ hwy  : Factor w/ 4 levels "FAI","MA","MC",..: 1 1 1 1 1 4 4 4 4 4 ...
 $ type : Factor w/ 2 levels "regular","major": 2 2 2 2 2 2 2 2 2 2 ...

When working with factor/nominal/categorical variables in R, preprocessing is necessary to ensure the data is properly handled by statistical models and algorithms. Many R functions automatically perform dummy coding when a variable is specified as a factor, typically using the as.factor() function.

Side note: Dummy coding (sometimes confused with one-hot encoding in CS) is just one way to code factor variables. It is popular because of its ease of interpretation.

A complete process of model validation usually consists of two parts: internal and external validation. Ideally (when you’re lucky 😄), you will have two independent datasets, such as EHR data from two hospitals. You will first build (or “train”) and validate your model internally using data from hospital A (the train-validation dataset). Then the model can be sent to hospital B for external validation (the test set). If you do not have two datasets, you can manually divide the data into two parts, with one part mimicking the external dataset.

  • Split-sample or split-sample averaged (SSA)

  • K-fold Cross-validation

  • Bootstrap: ordinary, .632

## Empirically verify the .632 rule: a bootstrap resample of size n drawn
## with replacement contains about 1 - 1/e ≈ 63.2% of the distinct ids.
set.seed(1017)
n_sample <- 100000
patient_id <- seq_len(n_sample)

## draw one bootstrap resample (sampling WITH replacement)
sample_dat <- sample(patient_id, size = n_sample, replace = TRUE)

## proportion of distinct ids that appear in the bootstrap resample
round(length(unique(sample_dat)) / n_sample, 3)
[1] 0.632
## Register the data-distribution summaries that rms requires before
## fitting and plotting its models.
h.dd <- datadist(hwy2)
options(datadist="h.dd")
## Restricted cubic splines with 4 knots for trks and sigs1; x=TRUE and
## y=TRUE store the design matrix and response inside the fit, which
## validate() needs for resampling later.
fit <- ols(rate~rcs(trks,4)+
             rcs(sigs1,4)+type+hwy,
           data=hwy2,x=TRUE,y=TRUE)
## Peek at the stored design matrix: spline basis columns (trks', trks'',
## sigs1', sigs1'') plus dummy-coded factor columns.
fit$x %>% head()
  trks      trks'    trks''      sigs1       sigs1'      sigs1'' type=major
1    8 0.02777778 0.0000000 0.20040080 2.023540e-03 0.0004753511          1
2    8 0.02777778 0.0000000 0.06207325 7.616807e-07 0.0000000000          1
3   10 0.75000000 0.2222222 0.10256410 8.221905e-05 0.0000000000          1
4   13 4.50000000 2.2222222 0.09389671 4.716474e-05 0.0000000000          1
5   12 3.02777778 1.4074074 0.04997501 0.000000e+00 0.0000000000          1
6    6 0.00000000 0.0000000 2.00750419 1.016433e+00 0.7779110205          1
  hwy=MA hwy=MC hwy=PA
1      0      0      0
2      0      0      0
3      0      0      0
4      0      0      0
5      0      0      0
6      0      0      1
set.seed(1017)
## Ordinary bootstrap validation with B = 100 resamples; reports
## optimism-corrected performance indexes.
validate(fit, B=100)

Divergence or singularity in 14 samples
          index.orig training   test optimism index.corrected  n
R-square      0.7110   0.7542 0.5195   0.2346          0.4764 86
MSE           1.1106   0.8411 1.8466  -1.0055          2.1161 86
g             1.8604   1.7997 1.6890   0.1107          1.7497 86
Intercept     0.0000   0.0000 0.4525  -0.4525          0.4525 86
Slope         1.0000   1.0000 0.8944   0.1056          0.8944 86
set.seed(1017)
## .632 bootstrap validation with B = 100: test performance is evaluated
## on the ~36.8% of observations left out of each bootstrap resample.
validate(fit,method = ".632",B=100)

Divergence or singularity in 14 samples
          index.orig training    test optimism index.corrected  n
R-square      0.7110   0.7542 -0.0506   0.4814          0.2297 86
MSE           1.1106   0.8411  2.8875  -1.1230          2.2336 86
g             1.8604   1.7997  1.1931   0.4217          1.4386 86
Intercept     0.0000   0.0000  0.9201  -0.5815          0.5815 86
Slope         1.0000   1.0000  0.6309   0.2332          0.7668 86

Bootstrapping with replacement leads to some data points being repeated in the bootstrap sample. As a result, the model may end up ‘memorizing’ these repeated points, which can make it appear to perform better on the bootstrap sample than it would on truly new data — this inflation is the optimism that the corrected indexes above account for.